knitr::opts_chunk$set(root.dir = "/Users/gaby/Downloads/ex")
library(foreign)   
library(tidyverse) # a set of different packages 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## â„č Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(TSstudio)  # provides a set of tools for descriptive and predictive analysis of time series data
library(forecast)  # provides methods and tools for displaying and analyzing univariate time series forecasts 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(corrplot)  # provides a visual exploratory tool on correlation matrix 
## corrplot 0.92 loaded
library(ggplot2)   # dedicated to data visualization
library(tseries)   # time series analysis 
library(mFilter)   # implements several time series filters useful for smoothing and extracting trend and cyclical 
library(dygraphs)  # an interface that provides rich facilities for charing time - series
library(stats)     # functions for statistical calculations and random number generation
library(astsa)     # applied statistical time series analysis 
## 
## Attaching package: 'astsa'
## 
## The following object is masked from 'package:forecast':
## 
##     gas
library(xts)       # convert objects to time series format
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(zoo)       # displays time series of numeric vectors / matrices / factors
library(AER)       # applied econometrics with R
## Warning: package 'AER' was built under R version 4.3.2
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
library(plm)       # linear models for panel data
## 
## Attaching package: 'plm'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead
library(vars)      # estimation, lag selection, diagnostic testing, forecasting, causality analysis and estimation of
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: strucchange
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Loading required package: urca
library(dynlm)     # dynamic linear models and time series regression 
library(dplyr)     # useful package / tool for working with a data frame. It focuses on data manipulation
library(panelvar)  # provides a comprehensive framework for panel vector autoregression models
## Welcome to panelvar! Please cite our package in your publications -- see citation("panelvar")
## 
## Attaching package: 'panelvar'
## 
## The following object is masked from 'package:vars':
## 
##     stability
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(foreign)
library(RColorBrewer) # offers color palettes
library(lmtest)       # testing linear regression models
library(stargazer)    # creates well formatted regression tables
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(tseries)      # time series analysis 

SECCIÓN I En este proyecto se trabajará con una base de datos panel relacionada con los precios de casas en los diferentes estados de Estados Unidos. Se buscará encontrar un modelo que pueda predecir los precios considerando los factores de entidad y de tiempo.

setwd("/Users/gaby/Downloads/ex")
houses <- read_csv("House_Price_USA.csv")
## Rows: 1421 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): names, plate, region.name
## dbl (7): state, year, region, price, income, pop, intrate
## 
## â„č Use `spec()` to retrieve the full column specification for this data.
## â„č Specify the column types or set `show_col_types = FALSE` to quiet this message.
femsa<-read.csv("FEMSAUBD.csv")
femsa2024<-read.csv("FEMSAUBD.MX.csv")

Con el uso de la función summary se puede concluir que estamos trabajando con una base de datos panel balanceada. Asimismo, es posible encontrar las medidas de tendencia central. Se puede observar que el precio promedio es de 99.89, el ingreso promedio es de 9.9, la población promedio es de 5076286, y la tasa de interés promedio es de 4.3.

summary(houses)
##      state           year         names              plate          
##  Min.   : 1.0   Min.   :1975   Length:1421        Length:1421       
##  1st Qu.:18.0   1st Qu.:1982   Class :character   Class :character  
##  Median :30.0   Median :1989   Mode  :character   Mode  :character  
##  Mean   :29.8   Mean   :1989                                        
##  3rd Qu.:42.0   3rd Qu.:1996                                        
##  Max.   :56.0   Max.   :2003                                        
##      region      region.name            price            income      
##  Min.   :1.000   Length:1421        Min.   : 58.09   Min.   : 5.910  
##  1st Qu.:3.000   Class :character   1st Qu.: 87.55   1st Qu.: 8.677  
##  Median :5.000   Mode  :character   Median : 96.87   Median : 9.718  
##  Mean   :4.327                      Mean   : 99.89   Mean   : 9.933  
##  3rd Qu.:6.000                      3rd Qu.:108.06   3rd Qu.:11.099  
##  Max.   :8.000                      Max.   :224.12   Max.   :18.219  
##       pop              intrate      
##  Min.   :  380477   Min.   :-5.544  
##  1st Qu.: 1472595   1st Qu.: 3.419  
##  Median : 3495939   Median : 4.572  
##  Mean   : 5076286   Mean   : 4.363  
##  3rd Qu.: 5840774   3rd Qu.: 5.687  
##  Max.   :35484453   Max.   :11.225
regions <- unique(houses$region.name)
print(regions)
## [1] "Plains"         "Southwest"      "Far West"       "Rocky Mountain"
## [5] "New England"    "Mideast"        "Southeast"      "Great Lakes"

Se convertirån las regiones a valores numéricos

houses$region.name[houses$region.name ==1] ="Plains"
houses$region.name[houses$region.name ==2] ="Southwest"
houses$region.name[houses$region.name ==3] ="Far West"
houses$region.name[houses$region.name ==4] ="Rocky Mountain"
houses$region.name[houses$region.name ==5] ="New England"
houses$region.name[houses$region.name ==6] ="Mideast"
houses$region.name[houses$region.name ==7] ="Southeast"
houses$region.name[houses$region.name ==8] ="Great Lakes"

Se transforma la variable de regiĂłn a categĂłrica

houses$region.name <- as.factor(houses$region.name) 
houses$names <- as.factor(houses$names) 
summary(houses)
##      state           year              names         plate          
##  Min.   : 1.0   Min.   :1975   Alabama    :  29   Length:1421       
##  1st Qu.:18.0   1st Qu.:1982   Arizona    :  29   Class :character  
##  Median :30.0   Median :1989   Arkansas   :  29   Mode  :character  
##  Mean   :29.8   Mean   :1989   California :  29                     
##  3rd Qu.:42.0   3rd Qu.:1996   Colorado   :  29                     
##  Max.   :56.0   Max.   :2003   Connecticut:  29                     
##                                (Other)    :1247                     
##      region              region.name      price            income      
##  Min.   :1.000   Plains        :348   Min.   : 58.09   Min.   : 5.910  
##  1st Qu.:3.000   Great Lakes   :203   1st Qu.: 87.55   1st Qu.: 8.677  
##  Median :5.000   Mideast       :174   Median : 96.87   Median : 9.718  
##  Mean   :4.327   New England   :174   Mean   : 99.89   Mean   : 9.933  
##  3rd Qu.:6.000   Rocky Mountain:145   3rd Qu.:108.06   3rd Qu.:11.099  
##  Max.   :8.000   Southeast     :145   Max.   :224.12   Max.   :18.219  
##                  (Other)       :232                                    
##       pop              intrate      
##  Min.   :  380477   Min.   :-5.544  
##  1st Qu.: 1472595   1st Qu.: 3.419  
##  Median : 3495939   Median : 4.572  
##  Mean   : 5076286   Mean   : 4.363  
##  3rd Qu.: 5840774   3rd Qu.: 5.687  
##  Max.   :35484453   Max.   :11.225  
## 

A través de las medidas de dispersión se puede observar que la población tiene una mayor dispersión de datos. A través de las medidas de rango de las diferentes variables es posible observar que los datos no tienen una gran dispersión y en realidad estån dentro de un rango pequeño. Al comparar esto con las medidas de tendencia central también se puede inferir que habrån datos atípicos debido a que el rango IQR es pequeño pero hay variables con una diferencia del måximo y el mínimo relativamente grande.

varianza<-var(houses$price)
ds<-sd(houses$price)
rango<-diff(range(houses$price))
iqr<-IQR(houses$price)
print(varianza)
## [1] 386.3875
print(ds)
## [1] 19.65674
print(rango)
## [1] 166.0264
print(iqr)
## [1] 20.50941
varianza<-var(houses$income)
ds<-sd(houses$income)
rango<-diff(range(houses$income))
iqr<-IQR(houses$income)
print(varianza)
## [1] 3.111296
print(ds)
## [1] 1.763887
print(rango)
## [1] 12.30894
print(iqr)
## [1] 2.421148
varianza<-var(houses$pop)
ds<-sd(houses$pop)
rango<-diff(range(houses$pop))
iqr<-IQR(houses$pop)
print(varianza)
## [1] 2.945701e+13
print(ds)
## [1] 5427432
print(rango)
## [1] 35103976
print(iqr)
## [1] 4368179
varianza<-var(houses$intrate)
ds<-sd(houses$intrate)
rango<-diff(range(houses$intrate))
iqr<-IQR(houses$intrate)
print(varianza)
## [1] 6.745633
print(ds)
## [1] 2.597236
print(rango)
## [1] 16.76963
print(iqr)
## [1] 2.268536

Se puede observar que las variables tienen sesgos, específicamente la de la población. Las otras variables, a pesar de tener una distribución relativamente normal, también tienen sesgos que pueden afectar el impacto de las variables en el modelo final.

hist(houses$price)

hist(houses$income)

hist(houses$pop)

hist(houses$intrate)

A través del diagrama de caja, se pueden observar datos atípicos, lo que se puede relacionar con los sesgos vistos en el histograma. De cualquier manera, a través de la caja se puede observar que la mayoría de los datos estån concentrados dentro del rango.

boxplot(houses$price)

boxplot(houses$income)

boxplot(houses$pop)

boxplot(houses$intrate)

A través del mapa se puede observar el precio promedio de casas por estado. Visualmente no se puede observar un impacto significativo por regiones ya sea en el centro, o cerca de la frontera, o mås.

library(dplyr)
library(usmap)
library(usmapdata)
## Warning: package 'usmapdata' was built under R version 4.3.2
## 
## Attaching package: 'usmapdata'
## The following object is masked from 'package:usmap':
## 
##     us_map
houses_state <- houses %>% group_by(state) %>% mutate(price = mean(price)) %>% arrange(price)
houses_state$fips <- sprintf("%02d", houses_state$state)
state_id <- usmapdata::fips_data("state")
houses_pd_join <- merge(houses_state, state_id, by.x = "fips", by.y = "fips")
plot_usmap(data = houses_pd_join, values = "price")

A través del gråfico de la serie de tiempo de los precios de hogares se puede observar que la serie no es estacionaria y un gran declive de precios a partir de 1978.

ts_prices<- ts(houses$price, start = c(1975), end = c(2003), frequency = 1) 
ts_plot(ts_prices, line.mode = "lines+markers", dash = "dash", Xtitle = "Year", Ytitle = "Dollars", title = "House prices")

Se hizo la prueba de Dickey Fuller para poder confirmar que la serie no es estacionaria. Debido a que el valor de p es mayor a 0.05 no serĂ­a considerada estacionaria.

adf.test(ts_prices,alternative="stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_prices
## Dickey-Fuller = -2.2381, Lag order = 3, p-value = 0.4813
## alternative hypothesis: stationary

Se generĂł un modelo lineal base tomando a consideraciĂłn todas las variables incluidas en el dataframe.

model1<-lm(price ~ income + pop + intrate + region, data= houses)
summary(model1)
## 
## Call:
## lm(formula = price ~ income + pop + intrate + region, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.501 -10.445  -0.603   8.589  92.005 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.749e+01  2.789e+00  27.782  < 2e-16 ***
## income       4.235e+00  2.519e-01  16.811  < 2e-16 ***
## pop          4.733e-07  7.938e-08   5.962 3.13e-09 ***
## intrate     -1.862e+00  1.586e-01 -11.738  < 2e-16 ***
## region      -3.222e+00  2.018e-01 -15.967  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.39 on 1416 degrees of freedom
## Multiple R-squared:  0.3889, Adjusted R-squared:  0.3871 
## F-statistic: 225.3 on 4 and 1416 DF,  p-value: < 2.2e-16

Debido a que los valores son menores a 5, se puede decir que la multicolinealidad no es un problema.

vif(model1)  
##   income      pop  intrate   region 
## 1.183783 1.113042 1.017722 1.071238

Se llevaron a cabo modelos de efectos fijos con todas las variables y es posible observar que los efectos de tiempo son importantes ya que es el modelo que tuvo mejor desempeño.

panel_model1<-plm(price ~ income + pop + intrate + region, data= houses, model="within",effect="twoways")
panel_model2<-plm(price ~ income + pop + intrate + region, data= houses, model="within",effect="individual")
panel_model3<-plm(price ~ income + pop + intrate + region, data= houses, model="within",effect="time") #mejor
summary(panel_model1) 
## Twoways effects Within Model
## 
## Call:
## plm(formula = price ~ income + pop + intrate + region, data = houses, 
##     effect = "twoways", model = "within")
## 
## Balanced Panel: n = 49, T = 29, N = 1421
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -43.16967  -6.21685  -0.80249   5.92160  48.92844 
## 
## Coefficients:
##            Estimate  Std. Error t-value  Pr(>|t|)    
## income   1.2298e+01  6.3250e-01 19.4438 < 2.2e-16 ***
## pop      1.7281e-06  3.7314e-07  4.6314 3.986e-06 ***
## intrate -3.0574e+00  3.1695e-01 -9.6464 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    224730
## Residual Sum of Squares: 158980
## R-Squared:      0.29256
## Adj. R-Squared: 0.25088
## F-statistic: 184.855 on 3 and 1341 DF, p-value: < 2.22e-16
summary(panel_model2) 
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price ~ income + pop + intrate + region, data = houses, 
##     effect = "individual", model = "within")
## 
## Balanced Panel: n = 49, T = 29, N = 1421
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -47.9442  -7.8578  -1.5584   6.9834  59.9015 
## 
## Coefficients:
##            Estimate  Std. Error  t-value Pr(>|t|)    
## income   5.2096e+00  2.9520e-01  17.6480   <2e-16 ***
## pop      5.0925e-07  4.0803e-07   1.2481   0.2122    
## intrate -1.8838e+00  1.2743e-01 -14.7838   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    287680
## Residual Sum of Squares: 204970
## R-Squared:      0.28751
## Adj. R-Squared: 0.26097
## F-statistic: 184.146 on 3 and 1369 DF, p-value: < 2.22e-16
summary(panel_model3) 
## Oneway (time) effect Within Model
## 
## Call:
## plm(formula = price ~ income + pop + intrate + region, data = houses, 
##     effect = "time", model = "within")
## 
## Balanced Panel: n = 49, T = 29, N = 1421
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -40.5645 -10.2410  -0.3150   8.9653  79.4120 
## 
## Coefficients:
##            Estimate  Std. Error  t-value  Pr(>|t|)    
## income   4.3061e+00  3.3365e-01  12.9062 < 2.2e-16 ***
## pop      5.0149e-07  7.8273e-08   6.4069 2.029e-10 ***
## intrate -3.9605e+00  4.2027e-01  -9.4238 < 2.2e-16 ***
## region  -3.2921e+00  2.0095e-01 -16.3823 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    485710
## Residual Sum of Squares: 305970
## R-Squared:      0.37007
## Adj. R-Squared: 0.35554
## F-statistic: 203.851 on 4 and 1388 DF, p-value: < 2.22e-16

También se generaron modelos utilizando efectos aleatorios y OLS. A través de los mismos se puede observar que el de efectos fijos continua teniendo el mejor desempeño.

panel_model4<-plm(price ~ income + pop + intrate + region, data= houses, model="random") 
summary(panel_model4)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = price ~ income + pop + intrate + region, data = houses, 
##     model = "random")
## 
## Balanced Panel: n = 49, T = 29, N = 1421
## 
## Effects:
##                   var std.dev share
## idiosyncratic 149.724  12.236  0.63
## individual     87.752   9.368  0.37
## theta: 0.7643
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -42.8450  -8.1036  -1.3078   6.9597  66.9573 
## 
## Coefficients:
##                Estimate  Std. Error  z-value  Pr(>|z|)    
## (Intercept)  6.8036e+01  4.1727e+00  16.3051 < 2.2e-16 ***
## income       5.1359e+00  2.7121e-01  18.9369 < 2.2e-16 ***
## pop          4.3882e-07  2.2261e-07   1.9713   0.04869 *  
## intrate     -1.8781e+00  1.2729e-01 -14.7537 < 2.2e-16 ***
## region      -3.0488e+00  6.6207e-01  -4.6050 4.125e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    302190
## Residual Sum of Squares: 212410
## R-Squared:      0.29708
## Adj. R-Squared: 0.29509
## Chisq: 598.441 on 4 DF, p-value: < 2.22e-16
panel_model5<-plm(price ~ income + pop + intrate + region, data= houses, model="pooling") 
summary(panel_model5)
## Pooling Model
## 
## Call:
## plm(formula = price ~ income + pop + intrate + region, data = houses, 
##     model = "pooling")
## 
## Balanced Panel: n = 49, T = 29, N = 1421
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -41.50070 -10.44536  -0.60276   8.58930  92.00531 
## 
## Coefficients:
##                Estimate  Std. Error  t-value  Pr(>|t|)    
## (Intercept)  7.7494e+01  2.7894e+00  27.7816 < 2.2e-16 ***
## income       4.2345e+00  2.5189e-01  16.8110 < 2.2e-16 ***
## pop          4.7329e-07  7.9379e-08   5.9624 3.134e-09 ***
## intrate     -1.8618e+00  1.5862e-01 -11.7375 < 2.2e-16 ***
## region      -3.2223e+00  2.0181e-01 -15.9672 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    548670
## Residual Sum of Squares: 335310
## R-Squared:      0.38887
## Adj. R-Squared: 0.38715
## F-statistic: 225.257 on 4 and 1416 DF, p-value: < 2.22e-16

Tests de diagnĂłstico Debido a que el valor de p es menor al 5% el modelo con efectos fijos es mejor que el OLS

pFtest(panel_model3, model1)  
## 
##  F test for time effects
## 
## data:  price ~ income + pop + intrate + region
## F = 4.7538, df1 = 28, df2 = 1388, p-value = 8.678e-15
## alternative hypothesis: significant effects

Debido a que el valor de p es menor al 5% es recomendable considerar efectos fijos con factor de tiempo

pFtest(panel_model3, panel_model5) 
## 
##  F test for time effects
## 
## data:  price ~ income + pop + intrate + region
## F = 4.7538, df1 = 28, df2 = 1388, p-value = 8.678e-15
## alternative hypothesis: significant effects

Debido a que el valor de p es menor a 5% es recomendable usar el modelo de efectos fijos sobre el de efectos aleatorios.

phtest(panel_model3, panel_model4) 
## 
##  Hausman Test
## 
## data:  price ~ income + pop + intrate + region
## chisq = 37.719, df = 4, p-value = 1.281e-07
## alternative hypothesis: one model is inconsistent

Debido a que el valor de p es menor a 5%, los efectos aleatorios son recomendables sobre OLS

plmtest(panel_model5, type=c("bp")) 
## 
##  Lagrange Multiplier Test - (Breusch-Pagan)
## 
## data:  price ~ income + pop + intrate + region
## chisq = 2581.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: significant effects

Debido a que en el test de Breusch Pagan el valor de p es menor al 5% hay heteroscedasticidad Debido a que los coeficientes tienen un valor menor a 0.05, son significativos Debido a que los coeficientes son menores a 0.05 se concluye que hay una relación entre las variables predictoras de ingresos, tasa de interés y región y la variable respuesta. Debido a que el de población no salió significativo la heteroscedasticidad puede estar afectando el error eståndar. Debido a que el valor de p en el test de Breusch-Godfrey/Wooldridge es menor a 0.05 se encuentra autocorrelación serial.

bptest(panel_model3)
## 
##  studentized Breusch-Pagan test
## 
## data:  panel_model3
## BP = 91.923, df = 4, p-value < 2.2e-16
coeftest(panel_model3)
## 
## t test of coefficients:
## 
##            Estimate  Std. Error  t value  Pr(>|t|)    
## income   4.3061e+00  3.3365e-01  12.9062 < 2.2e-16 ***
## pop      5.0149e-07  7.8273e-08   6.4069 2.029e-10 ***
## intrate -3.9605e+00  4.2027e-01  -9.4238 < 2.2e-16 ***
## region  -3.2921e+00  2.0095e-01 -16.3823 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(panel_model3, vcovHC)
## 
## t test of coefficients:
## 
##            Estimate  Std. Error t value  Pr(>|t|)    
## income   4.3061e+00  1.0531e+00  4.0889 4.583e-05 ***
## pop      5.0149e-07  2.6927e-07  1.8624   0.06276 .  
## intrate -3.9605e+00  9.1453e-01 -4.3306 1.594e-05 ***
## region  -3.2921e+00  7.2379e-01 -4.5484 5.877e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pbgtest(panel_model3)
## 
##  Breusch-Godfrey/Wooldridge test for serial correlation in panel models
## 
## data:  price ~ income + pop + intrate + region
## chisq = 1109.4, df = 29, p-value < 2.2e-16
## alternative hypothesis: serial correlation in idiosyncratic errors

Correción de modelos Se utiliza el método de Arellano para lidiar con la heteroscedasticidad y la correlación serial. Se obtienen nuevos coeficientes en los que la variable de población ya no es significativa.

coeftest(panel_model3, vcovHC(panel_model3, method = "arellano")) 
## 
## t test of coefficients:
## 
##            Estimate  Std. Error t value  Pr(>|t|)    
## income   4.3061e+00  1.0531e+00  4.0889 4.583e-05 ***
## pop      5.0149e-07  2.6927e-07  1.8624   0.06276 .  
## intrate -3.9605e+00  9.1453e-01 -4.3306 1.594e-05 ***
## region  -3.2921e+00  7.2379e-01 -4.5484 5.877e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coefficients_arellano <- c(income = 4.3061e+00, intrate = -3.9605e+00, region = -3.2921e+00)
modelo3_1 <- function(houses, coefficients) {
  income <- houses$income
  intrate <- houses$intrate
  region <- houses$region
  nuevo <- lm(price ~ income + intrate + region, data = houses)
  return(nuevo)
}
modelo_final <- modelo3_1(houses,coefficients_arellano)
summary(modelo_final)
## 
## Call:
## lm(formula = price ~ income + intrate + region, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.497 -10.519  -0.533   8.779  91.388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  74.5434     2.7784   26.83   <2e-16 ***
## income        4.7010     0.2423   19.40   <2e-16 ***
## intrate      -1.8568     0.1605  -11.57   <2e-16 ***
## region       -3.0610     0.2024  -15.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 1417 degrees of freedom
## Multiple R-squared:  0.3735, Adjusted R-squared:  0.3722 
## F-statistic: 281.6 on 3 and 1417 DF,  p-value: < 2.2e-16
AIC(modelo_final)
## [1] 11841.76

Al comparar los coeficientes del modelo seleccionado (efectos fijos con efectos de tiempo) y con los coeficientes de Arellano que intentan lidiar con la heteroscedasticidad y con la autocorrelación serial del modelo, se puede observar que el modelo con los coeficientes de Arellano tiene mejor desempeño. Asimismo, es importante recordar que se eliminó la variable de población con los coeficientes de Arellano ya que resultó no ser una variable significativa. Ambos modelos tienen un buen desempeño, sin embargo, se obtuvo una mejor R cuadrada ajustada con los coeficientes de Arellano y sin considerar la variable de población. Esto lo hace un modelo eficiente y simple. El hecho de que los efectos fijos y efectos de tiempo tuvieran el mejor desempeño indica que las variables no cambian a grandes razgos a través del tiempo y que estas mismas estån correlacionadas con la variable dependiente. Al obtener una R cuadrada ajustada de 0.372 se puede decir que aproximadamente el 37.2% de la variabilidad de los precios de hogares en Estados Unidos es explicada por las variables independientes incluidas en el modelo.

stargazer(panel_model3, modelo_final, title="Panel Regression Analysis", type="text", column.labels = c("Original","Arellano"))
## 
## Panel Regression Analysis
## =======================================================================
##                                     Dependent variable:                
##                     ---------------------------------------------------
##                                            price                       
##                               panel                      OLS           
##                              linear                                    
##                             Original                  Arellano         
##                                (1)                       (2)           
## -----------------------------------------------------------------------
## income                      4.306***                  4.701***         
##                              (0.334)                   (0.242)         
##                                                                        
## pop                        0.00000***                                  
##                             (0.00000)                                  
##                                                                        
## intrate                     -3.961***                 -1.857***        
##                              (0.420)                   (0.161)         
##                                                                        
## region                      -3.292***                 -3.061***        
##                              (0.201)                   (0.202)         
##                                                                        
## Constant                                              74.543***        
##                                                        (2.778)         
##                                                                        
## -----------------------------------------------------------------------
## Observations                  1,421                     1,421          
## R2                            0.370                     0.374          
## Adjusted R2                   0.356                     0.372          
## Residual Std. Error                              15.575 (df = 1417)    
## F Statistic         203.851*** (df = 4; 1388) 281.625*** (df = 3; 1417)
## =======================================================================
## Note:                                       *p<0.1; **p<0.05; ***p<0.01

Es importante tomar en cuenta que el modelo cuenta con heteroscedasticidad, lo que indica que el modelo no es totalmente preciso.

bptest(modelo_final)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_final
## BP = 94.916, df = 3, p-value < 2.2e-16

SECCIÓN II

Se puede observar que las acciones de FEMSA tienen un buen desempeño en general debido a que todos los valores son mayores a 0. Esto se puede observar a través de los mínimos.

summary(femsa)
##      Date                Open            High            Low       
##  Length:365         Min.   :113.7   Min.   :119.9   Min.   :112.7  
##  Class :character   1st Qu.:155.9   1st Qu.:159.8   1st Qu.:151.1  
##  Mode  :character   Median :170.3   Median :174.2   Median :166.8  
##                     Mean   :166.4   Mean   :170.3   Mean   :162.6  
##                     3rd Qu.:178.7   3rd Qu.:181.9   3rd Qu.:175.5  
##                     Max.   :226.7   Max.   :228.1   Max.   :221.4  
##      Close         Adj.Close         Volume        
##  Min.   :113.7   Min.   :107.1   Min.   : 2483101  
##  1st Qu.:154.8   1st Qu.:146.8   1st Qu.: 9359281  
##  Median :170.7   Median :157.4   Median :12271161  
##  Mean   :166.4   Mean   :156.4   Mean   :13533991  
##  3rd Qu.:178.8   3rd Qu.:165.8   3rd Qu.:16248080  
##  Max.   :223.0   Max.   :223.0   Max.   :41518042

A través de los histogramas se puede observar una distribución relativamente normal. Es posible observar sesgos en la mayoría de las variables pero de cualquier manera, los datos se ven concentrados.

hist(femsa$Open)

hist(femsa$High)

hist(femsa$Low)

hist(femsa$Close)

hist(femsa$Adj.Close)

hist(femsa$Volume)

A través de los diagramas de caja se puede observar que los valores estån concentrados ya que el rango IQR es pequeño. De cualquier manera, se pueden observar valores atípicos en todas las variables.

boxplot(femsa$Open)

boxplot(femsa$High)

boxplot(femsa$Low)

boxplot(femsa$Close)

boxplot(femsa$Adj.Close)

boxplot(femsa$Volume)

femsa$Date<-as.Date(femsa$Date)
ts_open<-ts(femsa$Open, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_open,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "Open")
ts_high<-ts(femsa$High, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_high,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "High")
ts_low<-ts(femsa$Low, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_low,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "Low")
ts_close<-ts(femsa$Close, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_close,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "Close")
ts_adj_close<-ts(femsa$Adj.Close, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_adj_close,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "Adj Close")
ts_volume<-ts(femsa$Volume, start = c(2017), end = c(2023), frequency = 52) 
ts_plot(ts_volume,line.mode = "lines", Xtitle = "Year", Ytitle = "$MXN", title = "Volume")
adf.test(ts_open) #Debido a que el valor de p es mayor a 0.05 no es estacionaria. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_open
## Dickey-Fuller = -2.7561, Lag order = 6, p-value = 0.2576
## alternative hypothesis: stationary
acf(ts_open,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

adf.test(ts_high) #Debido a que el valor de p es mayor a 0.05 no es estacionaria. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_high
## Dickey-Fuller = -2.9031, Lag order = 6, p-value = 0.1957
## alternative hypothesis: stationary
acf(ts_high,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

adf.test(ts_low) #Debido a que el valor de p es mayor a 0.05 no es estacionaria. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_low
## Dickey-Fuller = -2.8939, Lag order = 6, p-value = 0.1995
## alternative hypothesis: stationary
acf(ts_low,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

adf.test(ts_close) #Debido a que el valor de p es mayor a 0.05 no es estacionaria. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_close
## Dickey-Fuller = -2.7398, Lag order = 6, p-value = 0.2645
## alternative hypothesis: stationary
acf(ts_close,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

adf.test(ts_adj_close) #Debido a que el valor de p es mayor a 0.05 no es estacionaria. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_adj_close
## Dickey-Fuller = -2.7065, Lag order = 6, p-value = 0.2786
## alternative hypothesis: stationary
acf(ts_high,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

adf.test(ts_volume) #Debido a que el valor de p es menor a 0.05 es estacionaria. 
## Warning in adf.test(ts_volume): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_volume
## Dickey-Fuller = -5.3174, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(ts_volume,main="Significant Autocorrelation")     #Hay autocorrelaciĂłn serial

A través de la descomposición de la serie de tiempo de adjusted price se puede observar un componente de estacionalidad y de aleatoriedad. Sin embargo, no se puede decir que hay una tendencia de crecimiento. En general, se pueden observar fluctuaciones a través del tiempo.

descpr.ts <- decompose(ts_adj_close)
plot(descpr.ts)

dec<-decompose(ts_adj_close)
dec$seasonal
## Time Series:
## Start = c(2017, 1) 
## End = c(2023, 1) 
## Frequency = 52 
##   [1]  4.0348029539  4.8417270635  4.9683294193  4.4635230347 -0.8778866192
##   [6] -1.6708167884  0.1592405443  0.2740124904 -1.8779012423  1.3997419443
##  [11] -0.9780809000 -2.7854474307 -3.0330279557 -0.7892001730  0.3809913943
##  [16] -0.8494567096 -0.2420675769  2.0054329481 -0.3638124596  0.7160666154
##  [21]  0.3596860904 -0.2029091673  4.3454091347  1.7143792404  0.8296320366
##  [26] -0.6696314807  1.4819817747  1.4712845058  1.3045596289 -0.0002192192
##  [31]  0.4745892904  0.0162143058  1.2670563135 -0.0367315076  1.5411875308
##  [36] -0.6542197923 -1.5786618769 -0.9705394192  0.1966342193  0.2026716212
##  [41] -0.6875401538 -2.2688479923 -3.6899227576 -8.1490215000 -4.6434535788
##  [46] -1.4606627576 -0.7378268288 -0.7462711057 -1.1535684538  0.1100357943
##  [51]  0.9771798962  1.5813556558  4.0348029539  4.8417270635  4.9683294193
##  [56]  4.4635230347 -0.8778866192 -1.6708167884  0.1592405443  0.2740124904
##  [61] -1.8779012423  1.3997419443 -0.9780809000 -2.7854474307 -3.0330279557
##  [66] -0.7892001730  0.3809913943 -0.8494567096 -0.2420675769  2.0054329481
##  [71] -0.3638124596  0.7160666154  0.3596860904 -0.2029091673  4.3454091347
##  [76]  1.7143792404  0.8296320366 -0.6696314807  1.4819817747  1.4712845058
##  [81]  1.3045596289 -0.0002192192  0.4745892904  0.0162143058  1.2670563135
##  [86] -0.0367315076  1.5411875308 -0.6542197923 -1.5786618769 -0.9705394192
##  [91]  0.1966342193  0.2026716212 -0.6875401538 -2.2688479923 -3.6899227576
##  [96] -8.1490215000 -4.6434535788 -1.4606627576 -0.7378268288 -0.7462711057
## [101] -1.1535684538  0.1100357943  0.9771798962  1.5813556558  4.0348029539
## [106]  4.8417270635  4.9683294193  4.4635230347 -0.8778866192 -1.6708167884
## [111]  0.1592405443  0.2740124904 -1.8779012423  1.3997419443 -0.9780809000
## [116] -2.7854474307 -3.0330279557 -0.7892001730  0.3809913943 -0.8494567096
## [121] -0.2420675769  2.0054329481 -0.3638124596  0.7160666154  0.3596860904
## [126] -0.2029091673  4.3454091347  1.7143792404  0.8296320366 -0.6696314807
## [131]  1.4819817747  1.4712845058  1.3045596289 -0.0002192192  0.4745892904
## [136]  0.0162143058  1.2670563135 -0.0367315076  1.5411875308 -0.6542197923
## [141] -1.5786618769 -0.9705394192  0.1966342193  0.2026716212 -0.6875401538
## [146] -2.2688479923 -3.6899227576 -8.1490215000 -4.6434535788 -1.4606627576
## [151] -0.7378268288 -0.7462711057 -1.1535684538  0.1100357943  0.9771798962
## [156]  1.5813556558  4.0348029539  4.8417270635  4.9683294193  4.4635230347
## [161] -0.8778866192 -1.6708167884  0.1592405443  0.2740124904 -1.8779012423
## [166]  1.3997419443 -0.9780809000 -2.7854474307 -3.0330279557 -0.7892001730
## [171]  0.3809913943 -0.8494567096 -0.2420675769  2.0054329481 -0.3638124596
## [176]  0.7160666154  0.3596860904 -0.2029091673  4.3454091347  1.7143792404
## [181]  0.8296320366 -0.6696314807  1.4819817747  1.4712845058  1.3045596289
## [186] -0.0002192192  0.4745892904  0.0162143058  1.2670563135 -0.0367315076
## [191]  1.5411875308 -0.6542197923 -1.5786618769 -0.9705394192  0.1966342193
## [196]  0.2026716212 -0.6875401538 -2.2688479923 -3.6899227576 -8.1490215000
## [201] -4.6434535788 -1.4606627576 -0.7378268288 -0.7462711057 -1.1535684538
## [206]  0.1100357943  0.9771798962  1.5813556558  4.0348029539  4.8417270635
## [211]  4.9683294193  4.4635230347 -0.8778866192 -1.6708167884  0.1592405443
## [216]  0.2740124904 -1.8779012423  1.3997419443 -0.9780809000 -2.7854474307
## [221] -3.0330279557 -0.7892001730  0.3809913943 -0.8494567096 -0.2420675769
## [226]  2.0054329481 -0.3638124596  0.7160666154  0.3596860904 -0.2029091673
## [231]  4.3454091347  1.7143792404  0.8296320366 -0.6696314807  1.4819817747
## [236]  1.4712845058  1.3045596289 -0.0002192192  0.4745892904  0.0162143058
## [241]  1.2670563135 -0.0367315076  1.5411875308 -0.6542197923 -1.5786618769
## [246] -0.9705394192  0.1966342193  0.2026716212 -0.6875401538 -2.2688479923
## [251] -3.6899227576 -8.1490215000 -4.6434535788 -1.4606627576 -0.7378268288
## [256] -0.7462711057 -1.1535684538  0.1100357943  0.9771798962  1.5813556558
## [261]  4.0348029539  4.8417270635  4.9683294193  4.4635230347 -0.8778866192
## [266] -1.6708167884  0.1592405443  0.2740124904 -1.8779012423  1.3997419443
## [271] -0.9780809000 -2.7854474307 -3.0330279557 -0.7892001730  0.3809913943
## [276] -0.8494567096 -0.2420675769  2.0054329481 -0.3638124596  0.7160666154
## [281]  0.3596860904 -0.2029091673  4.3454091347  1.7143792404  0.8296320366
## [286] -0.6696314807  1.4819817747  1.4712845058  1.3045596289 -0.0002192192
## [291]  0.4745892904  0.0162143058  1.2670563135 -0.0367315076  1.5411875308
## [296] -0.6542197923 -1.5786618769 -0.9705394192  0.1966342193  0.2026716212
## [301] -0.6875401538 -2.2688479923 -3.6899227576 -8.1490215000 -4.6434535788
## [306] -1.4606627576 -0.7378268288 -0.7462711057 -1.1535684538  0.1100357943
## [311]  0.9771798962  1.5813556558  4.0348029539
plot(dec$seasonal)

Al intentar construir modelos ARMA primero se buscarĂĄ hacerlo con el logaritmo de la serie de tiempo no estacionaria. La transformaciĂłn logarĂ­tmica no fue suficiente e incluso se puede observar un AIC negativo.

#AIC negativo 
modelo0<-auto.arima(log(ts_adj_close),stationary=T,seasonal=T,stepwise=T)
summary(modelo0)
## Series: log(ts_adj_close) 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1   mean
##       0.9522  5.013
## s.e.  0.0164  0.034
## 
## sigma^2 = 0.0009359:  log likelihood = 647.13
## AIC=-1288.25   AICc=-1288.17   BIC=-1277.01
## 
## Training set error measures:
##                        ME       RMSE        MAE         MPE      MAPE      MASE
## Training set 0.0003316132 0.03049417 0.02181572 0.002746035 0.4379021 0.1640419
##                     ACF1
## Training set -0.04451921
ndiffs(log(ts_adj_close))
## [1] 1
ts_adj_diff=diff(log(ts_adj_close),differences=1)

Al aplicar las diferencias para poder construir un modelo ARMA se vuelve a observar que la serie de tiempo tiene un AIC negativo. Esto indica que el modelo no es bueno ya que los valores pueden estar overfitted.

#AIC negativo
modelo0_1<-auto.arima(ts_adj_diff,stationary=T,seasonal=T,stepwise=T)
summary(modelo0_1)
## Series: ts_adj_diff 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 0.000954:  log likelihood = 642.26
## AIC=-1282.51   AICc=-1282.5   BIC=-1278.77
## 
## Training set error measures:
##                        ME       RMSE        MAE MPE MAPE      MASE        ACF1
## Training set 0.0001781316 0.03088612 0.02212478 100  100 0.6591839 -0.06952365

Se busca hacer un modelo ARIMA en el que la serie de tiempo se vuelva estacionaria. Como se puede observar, se incluye un diferenciador. Asimismo, se indica que la serie tiene un componente estacional, como fue visto en la descomposiciĂłn de la serie de tiempo.

modelo1<-auto.arima(ts_adj_close,stationary=F,seasonal=T,stepwise=T)
summary(modelo1) 
## Series: ts_adj_close 
## ARIMA(0,1,0) 
## 
## sigma^2 = 19.44:  log likelihood = -905.61
## AIC=1813.22   AICc=1813.23   BIC=1816.96
## 
## Training set error measures:
##                     ME     RMSE      MAE         MPE     MAPE      MASE
## Training set 0.0262102 4.402009 3.246531 -0.02933105 2.204892 0.1660135
##                     ACF1
## Training set -0.07759413

Se busca hacer el modelo sin el componente estacional y se encuentran los mismos resultados que en el modelo anterior.

modelo2<-auto.arima(ts_adj_close,stationary=F,seasonal=F,stepwise=T)
summary(modelo2) 
## Series: ts_adj_close 
## ARIMA(0,1,0) 
## 
## sigma^2 = 19.44:  log likelihood = -905.61
## AIC=1813.22   AICc=1813.23   BIC=1816.96
## 
## Training set error measures:
##                     ME     RMSE      MAE         MPE     MAPE      MASE
## Training set 0.0262102 4.402009 3.246531 -0.02933105 2.204892 0.1660135
##                     ACF1
## Training set -0.07759413
auto.arima(ts_adj_close, trace=T)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2)(1,0,1)[52] with drift         : 1854.148
##  ARIMA(0,1,0)            with drift         : 1812.282
##  ARIMA(1,1,0)(1,0,0)[52] with drift         : 1849.155
##  ARIMA(0,1,1)(0,0,1)[52] with drift         : 1814.061
##  ARIMA(0,1,0)                               : 1810.267
##  ARIMA(0,1,0)(1,0,0)[52] with drift         : 1847.536
##  ARIMA(0,1,0)(0,0,1)[52] with drift         : 1814.032
##  ARIMA(0,1,0)(1,0,1)[52] with drift         : 1846.692
##  ARIMA(1,1,0)            with drift         : 1813.274
##  ARIMA(0,1,1)            with drift         : 1812.466
##  ARIMA(1,1,1)            with drift         : 1815.14
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,1,0)                               : 1813.234
## 
##  Best model: ARIMA(0,1,0)
## Series: ts_adj_close 
## ARIMA(0,1,0) 
## 
## sigma^2 = 19.44:  log likelihood = -905.61
## AIC=1813.22   AICc=1813.23   BIC=1816.96

Se utilizan los modelos con mejor desempeño de la función anterior y se encuentra un componente de diferenciación y uno de medias móviles.

modelo3 <- arima(ts_adj_close, order = c(0,1,1))
summary(modelo3) 
## 
## Call:
## arima(x = ts_adj_close, order = c(0, 1, 1))
## 
## Coefficients:
##           ma1
##       -0.0763
## s.e.   0.0560
## 
## sigma^2 estimated as 19.32:  log likelihood = -904.69,  aic = 1813.37
## 
## Training set error measures:
##                      ME    RMSE     MAE         MPE     MAPE      MASE
## Training set 0.02882179 4.38895 3.21987 -0.03086032 2.185639 0.9887563
##                       ACF1
## Training set -0.0009167525

Se busca comparar con otro modelo para poder seleccionar el de mejor desempeño. Tomando en cuenta las medidas de RMSE y AIC. Considerando esos valores el mejor modelo sería uno con un componente autorregresivo y uno de diferenciación. Es decir, un modelo ARIMA(1,1,0).

modelo4 <- arima(ts_adj_close, order = c(1,1,0))
summary(modelo4) 
## 
## Call:
## arima(x = ts_adj_close, order = c(1, 1, 0))
## 
## Coefficients:
##           ar1
##       -0.0774
## s.e.   0.0564
## 
## sigma^2 estimated as 19.32:  log likelihood = -904.67,  aic = 1813.34
## 
## Training set error measures:
##                      ME     RMSE     MAE         MPE     MAPE      MASE
## Training set 0.02873053 4.388722 3.21953 -0.03070237 2.185218 0.9886519
##                      ACF1
## Training set 0.0004438761

El modelo tiene un valor de p mayor a 0.05 en la prueba de Box-Ljung, lo que indica que los residuales son independientes, por lo que se estĂĄ trabajando con un buen modelo ya que tiene ruido blanco.

tsdiag(modelo4)

Box.test(residuals(modelo4), type= "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(modelo4)
## X-squared = 6.2262e-05, df = 1, p-value = 0.9937
error=residuals(modelo4)
plot(error)

Se puede observar que los residuales del modelo son estacionarios, lo cual es bueno porque indica mejores precisiones.

residuos <- residuals(modelo4)
adf.test(residuos)
## Warning in adf.test(residuos): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuos
## Dickey-Fuller = -7.103, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
pronostico <- forecast::forecast(modelo4, h=10)
plot(pronostico)

femsa2024$Date<-as.Date(femsa2024$Date)
femsa2024
##          Date   Open   High    Low  Close Adj.Close   Volume
## 1  2024-01-01 221.46 223.00 211.56 213.45    213.45  5479142
## 2  2024-01-08 214.65 220.26 214.23 219.20    219.20  7690391
## 3  2024-01-15 218.00 229.07 217.60 228.25    228.25 13469639
## 4  2024-01-22 228.00 237.61 227.03 233.85    233.85  9995229
## 5  2024-01-29 233.58 243.90 232.34 241.52    241.52  8962110
## 6  2024-02-05 242.00 244.94 236.00 242.58    242.58 13399393
## 7  2024-02-12 242.28 245.00 224.99 227.51    227.51  7333646
## 8  2024-02-19 229.92 229.92 200.66 203.40    203.40 18807624
## 9  2024-02-26 204.00 215.28 200.00 212.28    212.28 25730330
## 10 2024-03-04 210.26 213.06 209.56 210.22    210.22  3081163
df<- data.frame(date=femsa2024$Date, pronostico=pronostico$mean, real=femsa2024$Adj.Close)
df
##          date pronostico   real
## 1  2024-01-01   149.3112 213.45
## 2  2024-01-08   149.2980 219.20
## 3  2024-01-15   149.2990 228.25
## 4  2024-01-22   149.2990 233.85
## 5  2024-01-29   149.2990 241.52
## 6  2024-02-05   149.2990 242.58
## 7  2024-02-12   149.2990 227.51
## 8  2024-02-19   149.2990 203.40
## 9  2024-02-26   149.2990 212.28
## 10 2024-03-04   149.2990 210.22

Como se puede observar, el pronóstico del ARIMA no fue preciso. En realidad los valores del pronóstico se mantuvieron bajos y constantes mientras que en realidad hubo un incremento y después un declive en los precios de las acciones.

plot_data <- ggplot(femsa, aes(x = Date, y = Adj.Close)) +
  geom_line(color = "blue") +
  labs(x = "Date", y = "Adjusted Close", title = "Original Data until 2023") +
  theme_minimal()
plot_data <- plot_data +
  geom_line(data = femsa2024, aes(x = Date, y = Adj.Close), color = "green") +
  geom_line(data = df, aes(x = date, y = pronostico), color = "red") + 
  labs(color = "Lines", title = "Original Data until 2023 with New Values from 2024 and ARIMA Forecast") +
  scale_color_manual(values = c("blue", "green", "red")) 
print(plot_data)